knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
library(jsonlite)
library(httr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:httr':
##
## config
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
# Download the NYC EMS incident data from the Socrata open-data API.
# The file is ~2.3M rows, so cache it: skip the fetch when a local copy
# already exists (delete nyc_ems.csv to force a refresh). download.file()
# is used instead of shelling out to curl so the script is portable.
if (!file.exists("nyc_ems.csv")) {
  download.file(
    "https://data.cityofnewyork.us/resource/76xm-jjuj.csv?$limit=2300000",
    destfile = "nyc_ems.csv",
    mode = "wb"
  )
}
data <- read.csv("nyc_ems.csv")
head(data)
## cad_incident_id incident_datetime initial_call_type
## 1 220904738 2022-03-31T23:59:52.000 STNDBY
## 2 220904735 2022-03-31T23:59:45.000 DRUG
## 3 220904734 2022-03-31T23:59:02.000 DRUG
## 4 220904733 2022-03-31T23:58:37.000 TRAUMA
## 5 220904732 2022-03-31T23:58:19.000 SICK
## 6 220904731 2022-03-31T23:58:10.000 INJURY
## initial_severity_level_code final_call_type final_severity_level_code
## 1 8 STNDBY 8
## 2 4 DRUG 4
## 3 4 DRUG 4
## 4 2 TRAUMA 2
## 5 6 SICK 6
## 6 5 INJURY 5
## first_assignment_datetime valid_dispatch_rspns_time_indc
## 1 N
## 2 2022-03-31T23:59:45.000 Y
## 3 2022-03-31T23:59:13.000 Y
## 4 2022-03-31T23:59:10.000 Y
## 5 2022-03-31T23:59:20.000 Y
## 6 2022-03-31T23:58:44.000 Y
## dispatch_response_seconds_qy first_activation_datetime
## 1 0
## 2 0 2022-03-31T23:59:45.000
## 3 11 2022-03-31T23:59:19.000
## 4 33 2022-03-31T23:59:47.000
## 5 61 2022-03-31T23:59:34.000
## 6 34 2022-03-31T23:58:53.000
## first_on_scene_datetime valid_incident_rspns_time_indc
## 1 N
## 2 2022-03-31T23:59:45.000 Y
## 3 2022-04-01T00:02:02.000 Y
## 4 2022-04-01T00:01:27.000 Y
## 5 2022-04-01T00:17:19.000 Y
## 6 2022-04-01T00:08:01.000 Y
## incident_response_seconds_qy incident_travel_tm_seconds_qy
## 1 NA NA
## 2 0 0
## 3 180 169
## 4 170 137
## 5 1140 1079
## 6 591 557
## first_to_hosp_datetime first_hosp_arrival_datetime incident_close_datetime
## 1 2022-03-31T23:59:52.000
## 2 2022-04-01T00:01:50.000 2022-04-01T00:06:15.000 2022-04-01T00:45:52.000
## 3 2022-04-01T00:15:33.000 2022-04-01T00:20:22.000 2022-04-01T00:55:56.000
## 4 2022-04-01T00:12:02.000
## 5 2022-04-01T00:23:06.000 2022-04-01T00:37:57.000 2022-04-01T01:05:19.000
## 6 2022-04-01T00:22:34.000 2022-04-01T00:34:26.000 2022-04-01T00:51:18.000
## held_indicator incident_disposition_code borough incident_dispatch_area
## 1 N NA MANHATTAN M7
## 2 N 82 BRONX B2
## 3 N 82 MANHATTAN M3
## 4 N 90 BRONX B4
## 5 N 82 BROOKLYN K4
## 6 N 82 MANHATTAN M2
## zipcode policeprecinct citycouncildistrict communitydistrict
## 1 10035 25 8 111
## 2 10458 48 15 206
## 3 10019 18 3 104
## 4 10470 47 11 212
## 5 11212 73 41 316
## 6 10014 6 3 102
## communityschooldistrict congressionaldistrict reopen_indicator
## 1 4 12 N
## 2 10 15 N
## 3 2 10 N
## 4 11 16 N
## 5 23 9 N
## 6 2 10 N
## special_event_indicator standby_indicator transfer_indicator
## 1 N Y N
## 2 N N N
## 3 N N N
## 4 N N N
## 5 N N N
## 6 N N N
#Checking the dimensions of the data
dim(data)
## [1] 2300000 31
colnames(data)
## [1] "cad_incident_id" "incident_datetime"
## [3] "initial_call_type" "initial_severity_level_code"
## [5] "final_call_type" "final_severity_level_code"
## [7] "first_assignment_datetime" "valid_dispatch_rspns_time_indc"
## [9] "dispatch_response_seconds_qy" "first_activation_datetime"
## [11] "first_on_scene_datetime" "valid_incident_rspns_time_indc"
## [13] "incident_response_seconds_qy" "incident_travel_tm_seconds_qy"
## [15] "first_to_hosp_datetime" "first_hosp_arrival_datetime"
## [17] "incident_close_datetime" "held_indicator"
## [19] "incident_disposition_code" "borough"
## [21] "incident_dispatch_area" "zipcode"
## [23] "policeprecinct" "citycouncildistrict"
## [25] "communitydistrict" "communityschooldistrict"
## [27] "congressionaldistrict" "reopen_indicator"
## [29] "special_event_indicator" "standby_indicator"
## [31] "transfer_indicator"
# Drop the columns not required for our use case, then rename the remaining
# ones to shorter, friendlier names — both steps in a single pipeline.
clean_data <- data %>%
  select(
    -valid_dispatch_rspns_time_indc,
    -first_activation_datetime,
    -valid_incident_rspns_time_indc,
    -borough,
    -incident_dispatch_area,
    -policeprecinct,
    -citycouncildistrict,
    -communitydistrict,
    -communityschooldistrict,
    -congressionaldistrict,
    -first_on_scene_datetime,
    -first_to_hosp_datetime,
    -first_hosp_arrival_datetime
  ) %>%
  rename(
    incident_id = cad_incident_id,
    initial_call_reason = initial_call_type,
    final_call_reason = final_call_type,
    time_elapsed_assignment = dispatch_response_seconds_qy,
    time_incident_response = incident_response_seconds_qy,
    time_incident_travel = incident_travel_tm_seconds_qy
  )
# Count the NA values in each column (vapply pins the result type to
# integer, unlike sapply)
vapply(clean_data, function(col) sum(is.na(col)), integer(1))
## incident_id incident_datetime
## 0 0
## initial_call_reason initial_severity_level_code
## 0 0
## final_call_reason final_severity_level_code
## 0 0
## first_assignment_datetime time_elapsed_assignment
## 0 0
## time_incident_response time_incident_travel
## 68763 68415
## incident_close_datetime held_indicator
## 0 0
## incident_disposition_code zipcode
## 13864 33737
## reopen_indicator special_event_indicator
## 0 0
## standby_indicator transfer_indicator
## 0 0
#Removing rows having NA values as we would still have a large sample of data.
clean_data <- na.omit(clean_data)
#Checking the dimension of cleaned data
dim(clean_data)
## [1] 2198972 18
# Convert the datetime columns from character (ISO-8601 strings such as
# "2022-03-31T23:59:52.000") to POSIXct. across() replaces the repetitive
# per-column mutate, which also had a stray trailing comma that errors on
# older dplyr versions.
clean_data <- clean_data %>%
  mutate(across(
    c(incident_datetime, incident_close_datetime, first_assignment_datetime),
    ymd_hms
  ))
# Sanity check: rows where the first assignment happened BEFORE the incident
# was logged (should be impossible; expect 0)
sum(clean_data$first_assignment_datetime < clean_data$incident_datetime, na.rm = TRUE)
## [1] 0
# Sanity check: rows where the incident CLOSED before the first assignment
# (impossible ordering — these rows get dropped below)
sum(clean_data$incident_close_datetime < clean_data$first_assignment_datetime, na.rm = TRUE)
## [1] 52
# Keep only rows with a consistent timeline; the strict ">" also drops rows
# where close time equals assignment time
clean_data <- clean_data %>%
  filter(incident_close_datetime > first_assignment_datetime)
# Count the number of duplicate incident IDs
sum(duplicated(clean_data$incident_id))
## [1] 14
# Remove the duplicate incidents, keeping the first occurrence of each ID
clean_data <- distinct(clean_data, incident_id, .keep_all = TRUE)
# Severity codes are small whole numbers; store them as integers
clean_data$initial_severity_level_code <- as.integer(clean_data$initial_severity_level_code)
clean_data$final_severity_level_code <- as.integer(clean_data$final_severity_level_code)
# Check for out-of-range values in initial_severity_level_code
sum(!between(clean_data$initial_severity_level_code, 1, 8))
## [1] 7
# Check for out-of-range values in final_severity_level_code
sum(!between(clean_data$final_severity_level_code, 1, 8))
## [1] 0
# Inspect the rows where initial_severity_level_code is out of range
clean_data %>%
  filter(!between(initial_severity_level_code, 1, 8)) %>%
  select(initial_severity_level_code)
## initial_severity_level_code
## 1 9
## 2 9
## 3 9
## 4 9
## 5 9
## 6 9
## 7 9
# Keep only severity codes inside the valid 1-8 range
clean_data <- clean_data %>%
  filter(between(initial_severity_level_code, 1, 8))
# Helper: tabulate any indicator values other than "N"/"Y"
# (a table of extent 0 means the column is clean)
odd_flag_values <- function(flag) table(flag[!flag %in% c("N", "Y")])
odd_flag_values(clean_data$held_indicator)
## < table of extent 0 >
odd_flag_values(clean_data$reopen_indicator)
## < table of extent 0 >
odd_flag_values(clean_data$special_event_indicator)
## < table of extent 0 >
odd_flag_values(clean_data$standby_indicator)
## < table of extent 0 >
odd_flag_values(clean_data$transfer_indicator)
## < table of extent 0 >
# Define the regex pattern for a valid US zip code (5 digits, optional "+4")
zip_pattern <- "^[0-9]{5}(?:-[0-9]{4})?$"
# Count zipcodes that fail the pattern. zip_pattern is reused everywhere
# below instead of repeating slightly different inline regexes.
length(which(!str_detect(clean_data$zipcode, zip_pattern)))
## [1] 3
# Inspect the offending values
clean_data %>%
  filter(!str_detect(zipcode, zip_pattern)) %>%
  select(zipcode)
## zipcode
## 1 83
## 2 83
## 3 83
# NOTE(review): zipcode was parsed as an integer, so leading zeros were lost;
# "83" is plausibly 00083 (Central Park). Consider reading the column as
# character and zero-padding rather than dropping these rows — verify
# against the source data.
clean_data <- clean_data %>%
  filter(str_detect(zipcode, zip_pattern))
# The timing columns are whole seconds; store them as integers
clean_data$time_elapsed_assignment <- as.integer(clean_data$time_elapsed_assignment)
clean_data$time_incident_response <- as.integer(clean_data$time_incident_response)
clean_data$time_incident_travel <- as.integer(clean_data$time_incident_travel)
# Keep only incidents whose disposition code is one we can interpret
clean_data <- subset(clean_data, incident_disposition_code %in% c("82", "83","87","90","91","92","93","94","95","96","CANCEL","DUP","NOTSNT","ZZZZZZ"))
# Lookup table: disposition code -> human-readable meaning
code_meaning_map <- c("82" = "transporting patient", "83" = "patient pronounced dead", "87" = "cancelled", "90" = "unfounded", "91" = "condition corrected", "92" = "treated not transported", "93" = "refused medical aid", "94" = "treated and transported", "95" = "triaged at scene no transport", "96" = "patient gone on arrival", "CANCEL" = "cancelled", "DUP" = "duplicate incident", "NOTSNT" = "unit not sent", "ZZZZZZ" = "no disposition")
# BUG FIX: unname() the lookup result. Indexing a named vector keeps the
# names, which previously produced a "Named chr" column whose names
# attribute even showed up in str()/write.csv output.
clean_data <- mutate(clean_data, incident_disposition_code_meaning = unname(code_meaning_map[as.character(incident_disposition_code)]))
# Inspect the final structure of the cleaned dataset before saving it.
str(clean_data)
## 'data.frame': 2198698 obs. of 19 variables:
## $ incident_id : int 220904735 220904734 220904733 220904732 220904731 220904730 220904729 220904728 220904727 220904726 ...
## $ incident_datetime : POSIXct, format: "2022-03-31 23:59:45" "2022-03-31 23:59:02" ...
## $ initial_call_reason : chr "DRUG" "DRUG" "TRAUMA" "SICK" ...
## $ initial_severity_level_code : int 4 4 2 6 5 2 3 3 2 7 ...
## $ final_call_reason : chr "DRUG" "DRUG" "TRAUMA" "SICK" ...
## $ final_severity_level_code : int 4 4 2 6 5 2 3 3 2 7 ...
## $ first_assignment_datetime : POSIXct, format: "2022-03-31 23:59:45" "2022-03-31 23:59:13" ...
## $ time_elapsed_assignment : int 0 11 33 61 34 24 10 26 22 11 ...
## $ time_incident_response : int 0 180 170 1140 591 409 359 433 455 203 ...
## $ time_incident_travel : int 0 169 137 1079 557 385 349 407 433 192 ...
## $ incident_close_datetime : POSIXct, format: "2022-04-01 00:45:52" "2022-04-01 00:55:56" ...
## $ held_indicator : chr "N" "N" "N" "N" ...
## $ incident_disposition_code : int 82 82 90 82 82 82 82 82 90 82 ...
## $ zipcode : int 10458 10019 10470 11212 10014 11373 10003 10027 11375 10016 ...
## $ reopen_indicator : chr "N" "N" "N" "N" ...
## $ special_event_indicator : chr "N" "N" "N" "N" ...
## $ standby_indicator : chr "N" "N" "N" "N" ...
## $ transfer_indicator : chr "N" "N" "N" "N" ...
## $ incident_disposition_code_meaning: Named chr "transporting patient" "transporting patient" "unfounded" "transporting patient" ...
## ..- attr(*, "names")= chr [1:2198698] "82" "82" "90" "82" ...
# Persist the cleaned dataset for reuse by later analyses.
# NOTE(review): row.names = TRUE writes the row index as an extra unnamed
# first column in the CSV — use row.names = FALSE unless a downstream
# consumer relies on that column; confirm before changing.
write.csv(clean_data, file = "clean_data.csv", row.names = TRUE)
#We need to categorize the call time into 4 categories: Morning, Noon, Evening & Night.
# Create function to categorize call time
# Categorize the hour of a call timestamp into one of four dayparts.
#
# Args:
#   call_time: POSIXct timestamp(s) — works on a scalar or a whole vector.
# Returns:
#   Character vector the same length as call_time, one of:
#   "Morning" (05-11), "Noon" (12-16), "Evening" (17-20), "Night" (21-04).
categorize_call_time <- function(call_time) {
  hour <- as.integer(format(call_time, "%H"))
  # Vectorized via nested ifelse() so an entire ~2.2M-row column can be
  # categorized in one call; still works element-wise under sapply().
  ifelse(hour >= 5 & hour < 12, "Morning",
    ifelse(hour >= 12 & hour < 17, "Noon",
      ifelse(hour >= 17 & hour < 21, "Evening", "Night")))
}
# Apply the categorizer to incident_datetime to create the new column.
# sapply() calls the function once per row (~2.2M calls), so this step
# can take a while on the full dataset.
clean_data$call_time_category <- sapply(clean_data$incident_datetime, categorize_call_time)
# Which call-time category has the most incidents?
# count(sort = TRUE) is shorthand for group_by + summarize(n()) + arrange(desc)
clean_data %>%
  count(call_time_category, name = "count", sort = TRUE)
## # A tibble: 4 × 2
## call_time_category count
## <chr> <int>
## 1 Night 604854
## 2 Noon 585859
## 3 Morning 554847
## 4 Evening 453138
# Pie chart showing the % of incidents in each call-time category.
# Work out each category's share of all incidents
pct_by_category <- clean_data %>%
  count(call_time_category, name = "count") %>%
  mutate(percent = count / sum(count) * 100)
# A pie chart is a stacked bar in polar coordinates; labels show rounded %
ggplot(pct_by_category, aes(x = "", y = percent, fill = call_time_category)) +
  geom_col(width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  theme(legend.position = "bottom") +
  geom_text(aes(label = paste0(round(percent), "%")), position = position_stack(vjust = 0.5)) +
  labs(title = "Call Time Category Distribution", fill = "Call Time Category")
# Average incident response time (seconds) per call-time category.
# Group the data by time category
response_time_by_time_category <- clean_data %>%
group_by(call_time_category) %>%
# Calculate the average response time
summarize(avg_response_time = mean(time_incident_response, na.rm = TRUE)) %>%
# Sort the data by average response time in ascending order
arrange(avg_response_time)
# View the resulting data
# NOTE(review): this exact summary is recomputed verbatim later in the
# script; one of the two copies could be removed.
response_time_by_time_category
## # A tibble: 4 × 2
## call_time_category avg_response_time
## <chr> <dbl>
## 1 Evening 503.
## 2 Noon 510.
## 3 Morning 511.
## 4 Night 528.
# Box plot of the full response-time distribution per category — complements
# the averages above by showing spread and outliers.
ggplot(clean_data, aes(x = call_time_category, y = time_incident_response)) +
geom_boxplot() +
labs(x = "Call Time Category", y = "Time Incident Response (seconds)",
title = "Distribution of Time Incident Response by Call Time Category")
# We are going to analyze the average response and assignment times for
# different time categories of calls. The response time is the time between
# the initial call and the first unit being assigned to the incident, while
# the assignment time is the time elapsed from when the incident is created
# to the first unit being assigned.
# NOTE(review): this duplicates the response-time summary computed just
# above; kept here only so the bar plot section is self-contained.
response_time_by_time_category <- clean_data %>%
group_by(call_time_category) %>%
# Calculate the average response time
summarize(avg_response_time = mean(time_incident_response, na.rm = TRUE)) %>%
# Sort the data by average response time in ascending order
arrange(avg_response_time)
# View the resulting data
response_time_by_time_category
## # A tibble: 4 × 2
## call_time_category avg_response_time
## <chr> <dbl>
## 1 Evening 503.
## 2 Noon 510.
## 3 Morning 511.
## 4 Night 528.
#Bar plot showing the average response time for different time categories of calls.
ggplot(response_time_by_time_category, aes(x = call_time_category, y = avg_response_time)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Average Response Time by Time Category",
x = "Time Category",
y = "Average Response Time (in seconds)")
# Average time from incident creation to first unit assignment, by call-time
# category, sorted fastest-first.
assignment_time_by_time_category <- clean_data %>%
  group_by(call_time_category) %>%
  summarize(avg_assignment_time = mean(time_elapsed_assignment, na.rm = TRUE)) %>%
  arrange(avg_assignment_time)
# View the resulting data
assignment_time_by_time_category
## # A tibble: 4 × 2
## call_time_category avg_assignment_time
## <chr> <dbl>
## 1 Evening 47.4
## 2 Morning 47.7
## 3 Noon 49.8
## 4 Night 70.6
# Bar chart of average assignment time per category.
# geom_col() is the shorthand for geom_bar(stat = "identity").
ggplot(assignment_time_by_time_category,
       aes(x = call_time_category, y = avg_assignment_time)) +
  geom_col(fill = "steelblue") +
  labs(title = "Average Assignment Time by Time Category",
       x = "Time Category",
       y = "Average Assignment Time (in seconds)")
# In this use case we analyze call reasons in clean_data: the top 10 initial
# call reasons and their frequencies, how often initial_call_reason and
# final_call_reason differ, and the top final reasons for specific initial
# reasons. A plot is generated for each finding.
freq_initial_call_reason <- clean_data %>%
  count(initial_call_reason, name = "count", sort = TRUE)
top_10_initial_call_reasons <- freq_initial_call_reason %>%
  slice_max(count, n = 10)
# Horizontal bar chart, longest bar on top
ggplot(top_10_initial_call_reasons,
       aes(x = reorder(initial_call_reason, count), y = count)) +
  geom_col(fill = "Blue", color = "Black", width = 0.5) +
  coord_flip() +
  labs(title = "Top 10 Initial Call Reasons",
       x = "Initial Call Reason",
       y = "Count")
# How many incidents were reclassified between intake and closure?
sum(clean_data$initial_call_reason != clean_data$final_call_reason)
## [1] 292790
# Which initial call reasons get reclassified most often?
clean_data %>%
group_by(initial_call_reason) %>%
summarise(num_updates = sum(initial_call_reason != final_call_reason)) %>%
arrange(desc(num_updates))
## # A tibble: 121 × 2
## initial_call_reason num_updates
## <chr> <int>
## 1 EDP 64762
## 2 UNKNOW 42141
## 3 INJURY 23737
## 4 SICK 19523
## 5 UNC 16076
## 6 CARD 10138
## 7 DRUG 8157
## 8 DIFFBR 7901
## 9 CVAC 6759
## 10 EDPC 6715
## # ℹ 111 more rows
# For calls that came in as "UNKNOW": what did they end up classified as?
# select(-1) tries to drop the grouping column; dplyr re-adds it, hence the
# "Adding missing grouping variables" message below.
clean_data %>%
filter(initial_call_reason == "UNKNOW") %>%
group_by(final_call_reason) %>%
count() %>%
arrange(desc(n)) %>%
head(10) %>%
select(-1)
## Adding missing grouping variables: `final_call_reason`
## # A tibble: 10 × 2
## # Groups: final_call_reason [10]
## final_call_reason n
## <chr> <int>
## 1 UNKNOW 182473
## 2 SICK 5433
## 3 DRUG 4351
## 4 CARD 3632
## 5 UNC 3630
## 6 INJURY 3524
## 7 ALTMEN 3140
## 8 EDP 2056
## 9 ARREST 1548
## 10 INJMAJ 1392
# Same breakdown for calls that came in as "EDP"
clean_data %>%
filter(initial_call_reason == "EDP") %>%
group_by(final_call_reason) %>%
count() %>%
arrange(desc(n))
## # A tibble: 87 × 2
## # Groups: final_call_reason [87]
## final_call_reason n
## <chr> <int>
## 1 EDP 100731
## 2 EDPC 36017
## 3 EDPM 11844
## 4 DRUG 7614
## 5 SICK 1687
## 6 INJURY 1469
## 7 UNKNOW 1359
## 8 ALTMEN 934
## 9 MCI43 433
## 10 UNC 372
## # ℹ 77 more rows
# Plot the final call reasons that incidents with a given initial reason
# were reclassified to. Takes the 10 most frequent final reasons, then
# drops the unchanged one (final == initial), so up to 9 bars are shown.
get_final_reason_data <- function(initial_reason) {
  plot_data <- clean_data %>%
    filter(initial_call_reason == initial_reason) %>%
    group_by(final_call_reason) %>%
    count() %>%
    arrange(desc(n)) %>%
    head(10) %>%
    filter(final_call_reason != initial_reason)

  ggplot(plot_data, aes(x = reorder(final_call_reason, -n), y = n, fill = final_call_reason)) +
    geom_col() +
    scale_fill_brewer(palette = "Set1") +
    theme_minimal() +
    labs(title = paste0(initial_reason, " Calls by Final Reason"),
         x = "Final Call Reason",
         y = "Count")
}
# Plot the final-reason breakdown for three of the most-reclassified
# initial reasons identified above.
get_final_reason_data("EDP")
get_final_reason_data("UNKNOW")
get_final_reason_data("INJURY")
# For this use case, zipcodes are ranked by highest incident count and by
# highest average response time. This lets us find the hotspot zones by
# incidents and by response times; the zones are then plotted on a Google
# map using a Google Cloud API key.
# Top 10 zip codes by average response time
clean_data %>%
group_by(zipcode) %>%
summarize(avg_response_time = mean(time_incident_response, na.rm = TRUE)) %>%
arrange(desc(avg_response_time)) %>%
head(10)
## # A tibble: 10 × 2
## zipcode avg_response_time
## <int> <dbl>
## 1 11697 929.
## 2 10035 763.
## 3 11695 717.
## 4 11208 715.
## 5 10034 714.
## 6 11371 710.
## 7 10080 700.
## 8 10039 699.
## 9 10040 683.
## 10 10030 664.
# Top 10 zip codes by incident count (keeping their average response times)
hotspot_zones <- clean_data %>%
group_by(zipcode) %>%
summarize(incident_count=n(),avg_response_time = mean(time_incident_response, na.rm = TRUE)) %>%
arrange(desc(incident_count)) %>%
head(10)
# Citywide average response time — reused later as a reference line
avg_response_time <- clean_data %>%
summarize(mean_time_incident_response = mean(time_incident_response, na.rm = TRUE))
# Print the result
avg_response_time
## mean_time_incident_response
## 1 513.5109
hotspot_zones
## # A tibble: 10 × 3
## zipcode incident_count avg_response_time
## <int> <int> <dbl>
## 1 10456 41777 500.
## 2 11212 38540 533.
## 3 11207 37331 601.
## 4 10457 34870 484.
## 5 10467 34458 553.
## 6 10029 31882 599.
## 7 10458 31661 471.
## 8 10453 31003 531.
## 9 10035 30483 763.
## 10 10451 29889 478.
#Hotspot zones by incident counts
# SECURITY: never commit a real API key in source. Read it from the
# environment instead (set GOOGLE_MAPS_API_KEY before knitting); the key
# previously hard-coded here should be revoked and rotated.
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))
# Create a dataframe with the top 10 zip codes and their incident counts
top_zipcodes <- head(hotspot_zones, 10)
# Use ggmap to get the geographical coordinates for each zip code
top_zipcodes <- cbind(top_zipcodes, geocode(as.character(top_zipcodes$zipcode)))
# Create a map centered on New York City
ny_map <- get_map(location = "new york city", zoom = 11)
# Plot circles sized by incident count. The `text` aesthetic is unknown to
# ggplot (it warns about it) but ggplotly consumes it for hover tooltips.
gg <- ggmap(ny_map) +
  geom_point(data = top_zipcodes,
             aes(x = lon, y = lat, size = incident_count,
                 text = paste("Zip code:", zipcode, "<br>", "Incident count:", incident_count)),
             color = "red", alpha = 0.8) +
  scale_size_continuous(range = c(3, 10)) +
  ggtitle("Top 10 Zip Codes with the Most Incidents") +
  xlab("Longitude") +
  ylab("Latitude") +
  guides(size = guide_legend(title = "Incident Count")) +
  theme(plot.title = element_text(size = 14, face = "bold"))
# Render as an interactive plot with hover tooltips
ggplotly(gg, tooltip = c("text"), height = 600, width = 800)
# Checking whether the hotspot zones (by incident count) cope with their
# high volumes. The dashed blue line is the citywide average response time;
# green bars are zones at or below the average (doing well), red bars are
# zones above it (room for improvement).
values <- hotspot_zones$avg_response_time
# Use the citywide average computed earlier instead of re-hard-coding
# the 513.5109 magic number, so the two stay in sync.
ref_value <- avg_response_time$mean_time_incident_response
# Color bars by whether they beat the citywide average
colors <- ifelse(values > ref_value, "red", "green")
#png("plot.png", width = 1200, height = 800, res = 150)
# Bar plot of the hotspot zones' average response times
barplot(values, names.arg = hotspot_zones$zipcode, col = colors,
        ylim = c(0, max(values) * 1.2), ylab = "Values")
# Add the citywide-average reference line
abline(h = ref_value, lty = 2, col = "blue")
#Hotspot zones by highest response times
# SECURITY: read the API key from the environment; never hard-code it.
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))
# BUG FIX: this section is about the zones with the HIGHEST AVERAGE RESPONSE
# TIMES, but the original reused hotspot_zones, which is the top 10 by
# incident count. Recompute the top 10 by average response time (matching
# the ranking already shown earlier in the script).
slow_zipcodes <- clean_data %>%
  group_by(zipcode) %>%
  summarize(avg_response_time = mean(time_incident_response, na.rm = TRUE)) %>%
  arrange(desc(avg_response_time)) %>%
  head(10)
# Use ggmap to get the geographical coordinates for each zip code
slow_zipcodes <- cbind(slow_zipcodes, geocode(as.character(slow_zipcodes$zipcode)))
# Create a map centered on New York City
ny_map <- get_map(location = "new york city", zoom = 11)
# Plot circles sized by average response time; `text` feeds the ggplotly
# hover tooltip (ggplot warns about the unknown aesthetic).
gg <- ggmap(ny_map) +
  geom_point(data = slow_zipcodes,
             aes(x = lon, y = lat, size = avg_response_time,
                 text = paste("Zip code:", zipcode, "<br>", "Response Times:", avg_response_time)),
             color = "red", alpha = 0.8) +
  scale_size_continuous(range = c(3, 10)) +
  ggtitle("Top 10 Zip Codes with the Highest Response Times") +
  xlab("Longitude") +
  ylab("Latitude") +
  guides(size = guide_legend(title = "Response Times")) +
  theme(plot.title = element_text(size = 14, face = "bold"))
ggplotly(gg, tooltip = c("text"), height = 600, width = 800)
# Pie chart of incident_disposition_code_meaning in clean_data: a quick way
# to visualize the relative frequency of each disposition outcome.
disposition_counts <- table(clean_data$incident_disposition_code_meaning)
disposition_pct <- round(prop.table(disposition_counts) * 100, 2)
plot_ly(labels = names(disposition_counts), values = disposition_counts, type = "pie",
        textinfo = "label+percent",
        text = paste(names(disposition_counts), "(", disposition_pct, "%)"))